Tidy Tuesday 2-18-2025
Set up
#Load libraries
library(tidyverse)
library(ggmap)
library(keyring)
library(rpart)
library(partykit)
library(scales)
#Get data
tuesdata <- tidytuesdayR::tt_load('2025-02-18')
agencies <- tuesdata$agencies
Explore the Data
glimpse(agencies)
Rows: 19,166
Columns: 10
$ ori <chr> "AL0430200", "AL0430100", "AL0430000", "AL0070100", "…
$ county <chr> "LEE", "LEE", "LEE", "BIBB", "BIBB", "BIBB", "BIBB", …
$ latitude <dbl> 32.60406, 32.60809, 32.60406, 33.01589, 32.94000, 33.…
$ longitude <dbl> -85.35305, -85.47514, -85.35305, -87.12715, -87.11683…
$ state_abbr <chr> "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL", "AL",…
$ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama"…
$ agency_name <chr> "Opelika Police Department", "Auburn Police Departmen…
$ agency_type <chr> "City", "City", "County", "City", "County", "City", "…
$ is_nibrs <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
$ nibrs_start_date <date> 2021-01-01, 2020-12-01, 2012-04-01, 2020-01-01, 2021…
#Explore the states included in the data
agencies %>% group_by(state) %>% summarize(count=n()) %>% arrange(desc(count))
All fifty states are included in the dataset; however, the number of agencies listed varies widely from state to state.
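As a quick sanity check, we can confirm the number of distinct states and the spread in per-state counts directly (a minimal sketch using dplyr verbs loaded with tidyverse; any territories present in the data would be counted as states too):
#Count distinct states and the per-state range of agency counts
agencies %>% summarize(n_states = n_distinct(state))
agencies %>% count(state) %>% summarize(fewest = min(n), most = max(n))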
#Explore the types of agencies in the dataset
ggplot(data=agencies, aes(x=agency_type)) + geom_bar() +
labs(x='Agency Type', y='Count', title = 'Agency Type Counts') +
scale_y_continuous(labels = label_comma()) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.75))
As expected, we have a large number of city and county agencies. Types such as State Police, Other State Agency, Tribal, and University or College are also expected. Let's take a look at the Unknown, Other, and NA agency types.
#Pull all values in the Unknown and NA type
agencies %>% filter(agency_type %in% c('Unknown','Other',NA))
#Create an Education agency type from the University or College type plus relevant uncategorized agencies
agencies$agency_type <- ifelse(agencies$agency_type == 'University or College' |
(agencies$agency_type %in% c('Unknown','Other',NA) &
(str_detect(agencies$agency_name, 'University') |
str_detect(agencies$agency_name, 'College') |
str_detect(agencies$agency_name, 'Institute') |
str_detect(agencies$agency_name, 'School') |
str_detect(agencies$agency_name, 'Education'))),
'Education', agencies$agency_type)
#Create a Law Enforcement agency type and move State Police into that category
agencies$agency_type <- ifelse(agencies$agency_type == 'State Police' |
(agencies$agency_type %in% c('Unknown','Other',NA) &
(str_detect(agencies$agency_name, 'Enforcement') |
str_detect(agencies$agency_name, 'Drug') |
str_detect(agencies$agency_name, 'Narcotics') |
str_detect(agencies$agency_name, 'Alcohol') |
str_detect(agencies$agency_name, 'Liquor') |
str_detect(agencies$agency_name, 'Corrections') |
str_detect(agencies$agency_name, 'Highway Patrol') |
str_detect(agencies$agency_name, 'District Attorney') |
str_detect(agencies$agency_name, 'Attorney General') |
str_detect(agencies$agency_name, 'Criminal') |
str_detect(agencies$agency_name, 'Public Safety') |
str_detect(agencies$agency_name, 'State Patrol') |
str_detect(agencies$agency_name, 'Court') |
str_detect(agencies$agency_name, 'Crimes') |
str_detect(agencies$agency_name, 'Investigation') |
str_detect(agencies$agency_name, 'Detective') |
str_detect(agencies$agency_name, 'Constable') |
str_detect(agencies$agency_name, 'Police'))),
'Law Enforcement',agencies$agency_type)
#Add agencies to the Tribal agency_type
agencies$agency_type <- ifelse(agencies$agency_type %in% c('Unknown','Other',NA) &
(str_detect(agencies$agency_name, 'Tribal') |
str_detect(agencies$agency_name, 'Indian') |
str_detect(agencies$agency_name, 'Hopi'))
, 'Tribal', agencies$agency_type)
#Collapse Unknown and NA into Other
agencies$agency_type <- ifelse(agencies$agency_type %in% c('Unknown','Other',NA),
'Other', agencies$agency_type)
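As an aside, the repeated ifelse()/str_detect() calls above can be collapsed into one mutate() with a single regex per category. This is an equivalent sketch, meant to run in place of the four steps above rather than after them; the *_pattern names are illustrative:
#Patterns for each category (illustrative names)
edu_pattern <- 'University|College|Institute|School|Education'
le_pattern <- paste('Enforcement|Drug|Narcotics|Alcohol|Liquor|Corrections|Highway Patrol',
                    'District Attorney|Attorney General|Criminal|Public Safety|State Patrol',
                    'Court|Crimes|Investigation|Detective|Constable|Police', sep = '|')
tribal_pattern <- 'Tribal|Indian|Hopi'
#Recode in one pass; case_when checks conditions in order, matching the original sequence
agencies <- agencies %>%
  mutate(unclassified = agency_type %in% c('Unknown','Other',NA),
         agency_type = case_when(
           agency_type == 'University or College' ~ 'Education',
           unclassified & str_detect(agency_name, edu_pattern) ~ 'Education',
           agency_type == 'State Police' ~ 'Law Enforcement',
           unclassified & str_detect(agency_name, le_pattern) ~ 'Law Enforcement',
           unclassified & str_detect(agency_name, tribal_pattern) ~ 'Tribal',
           unclassified ~ 'Other',
           TRUE ~ agency_type)) %>%
  select(-unclassified)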
ggplot(data=agencies, aes(x=agency_type)) + geom_bar() +
scale_y_continuous(labels = label_comma()) +
  labs(x='Agency Type', y='Count', title = 'Agency Type Counts')
Now, let's explore nibrs_start_date to see when agencies began reporting data to NIBRS.
#Confirm all reporting agencies have a start date
agencies %>% filter(is_nibrs, is.na(nibrs_start_date))
#Plot the year each agency began reporting to NIBRS
ggplot(agencies %>% filter(is_nibrs), aes(x=year(nibrs_start_date))) + geom_bar() +
  labs(x='Start Year', y='Count', title='Start Year Distribution') +
  scale_y_continuous(labels = label_comma())
It looks like NIBRS has been receiving data since 1990; however, the number of reporting agencies increased sharply beginning in the late 2010s.
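We can confirm that earliest date directly rather than reading it off the plot (a minimal sketch):
#Find the earliest NIBRS start date among reporting agencies
agencies %>% filter(is_nibrs) %>%
  summarize(earliest_start = min(nibrs_start_date, na.rm = TRUE))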
Now, let's look at the breakdown by agency_type.
#First, normalize the number of reporting agencies each year within each agency_type
agencies_summary <- agencies %>% filter(is_nibrs) %>%
group_by(agency_type, year(nibrs_start_date)) %>%
  summarise(count = n()) %>%
  #summarise() drops the last grouping level, so sum(count) totals within agency_type
  mutate(percentage = count / sum(count)) %>%
rename('year'='year(nibrs_start_date)')
#Create bar plot with percentages, faceted by agency_type
ggplot(agencies_summary, aes(x = year, y = percentage)) +
  geom_col() +
  labs(x='Year', y='Percentage') +
  scale_y_continuous(labels = label_percent()) +
  facet_wrap(~agency_type)
With the exception of Other State Agency, all agency types increased their reporting around the late 2010s.
Visualize the Agencies Geographically
Now, let’s take a look at the locations of each agency using the latitude and longitude.
First, for simplicity, we'll look only at the continental US. Let's check whether any coordinates fall outside the continental US bounding box and do not belong to Hawaii or Alaska.
#Continental US bounding box
us_box <- c(left = -125, bottom = 24.5, right = -67, top = 49)
#Explore points outside of the continental US bounding box
agencies %>% filter(latitude < us_box['bottom'] | latitude > us_box['top'] |
longitude < us_box['left'] | longitude > us_box['right']) %>%
  filter(!(state %in% c('Hawaii', 'Alaska')))
There is one agency in Kentucky with apparently incorrect latitude and longitude; the values are likely a placeholder. We will remove this data point from the data frame.
agencies <- agencies %>% filter(ori != 'KY0710900')
#Register the Stadia Maps API key stored with keyring
register_stadiamaps(key=key_get(service = "stadia", username="api-key"), write=F)
#Create the map
us <- get_stadiamap(us_box, zoom = 5, maptype = "alidade_smooth")
#Plot the agencies
ggmap(us) + geom_point(data=agencies, aes(x=longitude, y=latitude), size=0.05) +
labs(x='',y='', title='Agencies in the Continental US') +
theme(axis.text.y=element_blank(), axis.text.x=element_blank(),
      axis.ticks.y=element_blank(), axis.ticks.x=element_blank())
Now, let's look at the city/county breakdown on the map.
#Plot the City and County agencies together
ggmap(us) + geom_point(data=agencies %>% filter(agency_type %in% c('County','City')),
aes(x=longitude, y=latitude, color=agency_type), size=0.05) +
labs(x='',y='', title='City and County Agencies in the Continental US',
color='Agency Type') + theme(axis.text.y=element_blank(),
axis.text.x=element_blank(),
                             axis.ticks.y=element_blank(), axis.ticks.x=element_blank())
#Plot the City and County agencies individually
ggmap(us) + geom_point(data=agencies %>% filter(agency_type %in% c('County','City')),
aes(x=longitude, y=latitude, color=agency_type), size=0.05) +
facet_wrap(~agency_type) +
labs(x='',y='', title='City and County Agencies in the Continental US',
color='Agency Type') + theme(axis.text.y=element_blank(),
axis.text.x=element_blank(),
                             axis.ticks.y=element_blank(), axis.ticks.x=element_blank())
Let's look at the year each agency began reporting data to NIBRS.
ggmap(us) + geom_point(data=agencies %>% filter(is_nibrs),
aes(x=longitude, y=latitude, color=year(nibrs_start_date)),
size=0.05) +
labs(x='',y='', title='Agency Start Years in the Continental US',
color='Start Year') + theme(axis.text.y=element_blank(),
axis.text.x=element_blank(),
                              axis.ticks.y=element_blank(), axis.ticks.x=element_blank())
The start years appear to cluster by state and region. Let's see if we can build a model to predict nibrs_start_date from the state.
#Set random seed
set.seed(1817)
#Filter to NIBRS-reporting agencies and create a start_year variable
agencies_nibr <- agencies %>% filter(is_nibrs)
agencies_nibr$start_year <- year(agencies_nibr$nibrs_start_date)
#Divide data into training (80%) and testing (20%)
n <- nrow(agencies_nibr)
test_index <- sample.int(n,size=round(n*0.2))
train_agencies <- agencies_nibr[-test_index,]
test_agencies <- agencies_nibr[test_index,]
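Since state will be the only predictor, one could also stratify the split by state so every state is represented in training. A sketch of that alternative (the _strat names are illustrative, this version is not used below, and it assumes ori uniquely identifies an agency):
#Alternative: sample 20% within each state for the test set
test_strat <- agencies_nibr %>% group_by(state) %>% slice_sample(prop = 0.2) %>% ungroup()
train_strat <- anti_join(agencies_nibr, test_strat, by = 'ori')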
#Create a decision tree
date_tree <- rpart(start_year~state, data=train_agencies, cp=0.0002)
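Here cp is rpart's complexity parameter: roughly, a split must improve the overall fit by at least that fraction to be kept. rpart's built-in diagnostics show how tree size and cross-validated error vary with cp:
#Inspect the complexity-parameter table and cross-validated error
printcp(date_tree)
plotcp(date_tree)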
#Plot the decision tree
plot(as.party(date_tree))
#Use the decision tree to make predictions for the test data
test_agencies$preds <- round(predict(date_tree, newdata = test_agencies))
#Create confusion matrix
confusion <- table(test_agencies$start_year,test_agencies$preds, dnn=c("actual", "predicted"))
confusion
          predicted
actual 1995 1997 1998 2001 2002 2004 2006 2007 2010 2012 2013 2016 2018 2019
1991 0 1 60 0 0 0 0 0 0 0 0 0 0 0
1992 17 44 6 0 0 0 0 0 0 0 0 0 0 0
1993 3 2 1 2 0 0 0 0 0 5 0 0 0 0
1994 0 34 0 1 0 0 0 0 0 0 0 0 0 0
1995 1 11 49 0 0 12 0 0 0 0 0 0 0 0
1996 0 12 31 1 0 4 0 0 0 0 0 0 0 0
1997 0 9 37 0 31 3 0 0 1 1 0 0 0 0
1998 0 9 43 28 1 2 7 4 0 1 0 3 0 2
1999 0 8 39 30 10 4 2 10 0 0 0 0 0 1
2000 0 7 14 3 75 13 0 5 2 0 0 0 0 1
2001 0 3 5 9 4 14 1 12 0 0 0 2 0 0
2002 0 0 2 19 0 6 6 9 1 2 0 4 0 0
2003 0 1 3 8 3 1 53 6 1 5 0 2 1 0
2004 0 0 11 4 4 8 20 9 0 0 0 2 1 0
2005 0 1 1 1 4 1 14 15 0 2 0 1 3 1
2006 0 0 1 2 3 0 4 19 0 2 0 5 0 5
2007 0 2 1 0 0 2 1 9 0 0 0 2 2 2
2008 0 1 0 0 2 2 6 6 1 1 35 2 1 2
2009 0 1 0 0 3 1 1 3 54 9 2 7 0 2
2010 0 0 1 0 1 2 4 5 2 5 2 4 0 0
2011 0 1 2 0 2 0 1 0 0 24 8 1 0 1
2012 0 0 1 0 5 1 3 7 1 5 6 1 0 2
2013 0 0 0 0 1 0 1 6 1 0 5 3 1 1
2014 0 0 0 1 1 1 1 3 2 17 3 1 0 0
2015 0 0 3 0 1 2 4 2 2 8 3 6 3 3
2016 0 1 1 0 0 0 2 3 0 9 0 2 1 8
2017 1 1 1 0 6 0 5 0 0 3 10 2 2 17
2018 1 1 3 0 1 1 2 2 2 10 5 8 1 55
2019 0 1 1 0 1 2 1 2 1 2 4 4 2 205
2020 0 6 3 1 1 2 4 5 0 4 2 24 16 129
2021 0 3 2 0 3 5 5 12 0 8 11 38 28 95
2022 0 3 3 0 1 0 1 5 1 2 0 4 1 15
2023 0 0 4 0 0 1 2 1 2 2 2 0 1 64
predicted
actual 2020 2021
1991 0 0
1992 0 0
1993 0 0
1994 0 0
1995 0 0
1996 0 0
1997 0 0
1998 0 0
1999 0 0
2000 0 0
2001 0 0
2002 0 0
2003 0 0
2004 1 0
2005 0 0
2006 0 0
2007 0 0
2008 0 0
2009 0 0
2010 2 0
2011 0 0
2012 0 0
2013 0 5
2014 1 2
2015 2 1
2016 2 0
2017 3 0
2018 11 0
2019 31 10
2020 83 40
2021 55 268
2022 13 136
2023 7 49
#Calculate prediction accuracy
sum(diag(confusion)) / sum(confusion)
[1] 0.01957532
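One caution about this calculation: the confusion table is not square (33 actual years against 16 predicted years), so diag() pairs mismatched row and column labels here. A label-safe version compares the vectors directly, and its exact-match accuracy may differ somewhat from the value above:
#Label-safe accuracy: compare actual and predicted years directly
mean(test_agencies$start_year == test_agencies$preds)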
The model's exact-year accuracy is around 2%, which is very poor. However, reviewing the confusion matrix, the model is frequently off by only a year or two. Therefore, let's check whether the model is more accurate after we bucket the years.
#Create a start range that buckets start_year into five-year groups
test_agencies$start_range <-
cut(test_agencies$start_year, c(1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 3000),
labels = c('1985-1990', '1991-1995', '1996-2000', '2001-2005', '2006-2010', '2011-2015',
'2016-2020', '2020+'), include.lowest=TRUE)
test_agencies$start_range_preds <-
cut(test_agencies$preds, c(1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 3000),
labels = c('1985-1990', '1991-1995', '1996-2000', '2001-2005', '2006-2010', '2011-2015',
'2016-2020', '2020+'), include.lowest=TRUE)
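The two cut() calls duplicate the breaks and labels; a small helper keeps them in one place (an equivalent sketch, with bucket_years as an illustrative name):
#Helper that buckets years into the same five-year ranges
bucket_years <- function(x) {
  cut(x, c(1985, 1990, 1995, 2000, 2005, 2010, 2015, 2020, 3000),
      labels = c('1985-1990', '1991-1995', '1996-2000', '2001-2005', '2006-2010',
                 '2011-2015', '2016-2020', '2020+'), include.lowest = TRUE)
}
test_agencies$start_range <- bucket_years(test_agencies$start_year)
test_agencies$start_range_preds <- bucket_years(test_agencies$preds)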
#Create confusion matrix
confusion_range <- table(test_agencies$start_range,test_agencies$start_range_preds, dnn=c("actual", "predicted"))
confusion_range
          predicted
actual 1985-1990 1991-1995 1996-2000 2001-2005 2006-2010 2011-2015
1985-1990 0 0 0 0 0 0
1991-1995 0 21 208 15 0 5
1996-2000 0 0 209 205 31 2
2001-2005 0 0 27 86 147 9
2006-2010 0 0 7 18 115 56
2011-2015 0 0 7 15 34 79
2016-2020 0 2 19 15 29 49
2020+ 0 0 15 10 29 25
predicted
actual 2016-2020 2020+
1985-1990 0 0
1991-1995 0 0
1996-2000 7 0
2001-2005 18 0
2006-2010 36 0
2011-2015 26 8
2016-2020 606 50
2020+ 321 453
#Calculate prediction accuracy
sum(diag(confusion_range)) / sum(confusion_range)
[1] 0.5205707
Bucketing the years drastically improved the model's accuracy, to 52%. Let's see, on average, how far the predictions are from the actual values.
mean(abs(test_agencies$start_year - test_agencies$preds))
[1] 3.093232
Though the exact-year model has only a 2% accuracy rate, its predictions are, on average, about three years from the actual value.
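To make the "off by only a few years" point concrete, one can also compute a tolerance-based accuracy (a minimal sketch; the two-year window is an arbitrary choice):
#Share of predictions within two years of the actual start year
mean(abs(test_agencies$start_year - test_agencies$preds) <= 2)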